Exploratory Block Group Analysis

# read prison boundaries file from Homeland Infrastructure Foundation-Level Data Platform
# downloaded in geodatabase form and unzipped, read as sf object
# link: https://hifld-geoplatform.opendata.arcgis.com/datasets/geoplatform::prison-boundaries/about
prison_boundaries <- st_read("./1075f3ca-0050-4264-82e2-079a2daf2ec5.gdb", quiet = TRUE)


# read results that contain block assignment for each facility saved in csv
prison_boundaries_blocks <- read_csv("./prison_blocks.csv")

# for prison boundaries file, get list of all state county combinations represented, including the
# codes for the corresponding states and counties
# resulting data frame has 4 columns --> 
#        COUNTYFIPS -- 5 digit code consisting of state code and county code
#        STATE -- 2 chracter digit state abbreviation 
#        STATEFIPS -- 2 digit state FIPS
#        COUNTYCODE -- 3 digit county code
state_county_list <- st_drop_geometry(prison_boundaries) %>% 
  group_by(COUNTYFIPS, STATE) %>% 
  summarize() %>% 
  select( COUNTYFIPS, STATE) %>%
  mutate(STATE = as.character(STATE),
         COUNTYFIPS = as.character(COUNTYFIPS),
         STATEFIPS = substr(COUNTYFIPS, 1, 2),
         COUNTYCODE= substr(COUNTYFIPS, 
                             nchar(COUNTYFIPS) - 2, 
                             nchar(COUNTYFIPS)))  


# remove the NOT AVAILABLE entry  
state_county_list_filtered <- state_county_list[state_county_list$COUNTYFIPS != "NOT AVAILABLE",]


# get all state fips codes in prison boundaries data 
state_codes <- state_county_list_filtered %>% 
  group_by(STATEFIPS) %>% 
  summarize() %>% 
  pull(STATEFIPS)


# can use this function to see what variables are available for the given dataset
# listCensusMetadata(name = "pdb/blockgroup", vintage = 2021, type = "variables", group = NULL)


# get census data for the variables listed at the blockgroup level; block level not available 
get_census_data_safe <- function(state_code) {
  region_for_state = paste0("state:", state_code, "+county:*+tract:*")
  result <- tryCatch( 
    {
    getCensus(name = "pdb/blockgroup",
                      vintage = 2021,
                      key = "fbbc8b0d3e53089da7cabc380628d6d46ae00444",
                      vars = c("Block_group",
                               "County", 
                               "URBANIZED_AREA_POP_CEN_2010",
                               "Tot_Population_CEN_2010", 
                               "Tot_Population_ACS_15_19", 
                               "Males_ACS_15_19", 
                               "Females_ACS_15_19",
                               "Median_Age_ACS_15_19", 
                               "Hispanic_ACS_15_19",
                               "NH_White_alone_ACS_15_19",
                               "NH_Blk_alone_ACS_15_19",
                               "NH_AIAN_alone_ACS_15_19",
                               "NH_Asian_alone_ACS_15_19",
                               "NH_NHOPI_alone_ACS_15_19", 
                               "NH_SOR_alone_ACS_15_19",
                               "Pov_Univ_ACS_15_19",
                               "Prs_Blw_Pov_Lev_ACS_15_19",
                               "No_Health_Ins_ACS_15_19",
                               "Tot_Occp_Units_ACS_15_19",
                               "Aggregate_HH_INC_ACS_15_19",
                               "Med_HHD_Inc_BG_ACS_15_19",
                               "Tot_Prns_in_HHD_ACS_15_19",
                               "Renter_Occp_HU_ACS_15_19",
                               "Med_House_Value_BG_ACS_15_19",
                               "pct_ENG_VW_ACSMOE_15_19",
                               "pct_URBANIZED_AREA_POP_CEN_2010",
                               "pct_RURAL_POP_CEN_2010",
                               "pct_Inst_GQ_CEN_2010",
                               "pct_Inst_GQ_CEN_2010",
                               "pct_Non_Inst_GQ_CEN_2010",
                               "pct_Non_Inst_GQ_CEN_2010",
                               "pct_URBAN_CLUSTER_POP_CEN_2010"
                               ),
                      regionin = region_for_state,
                      region = "block group:*")
          } ,
      error=function(e) {
            return()
        } 
      )
  return(result)
}

# get block group level data for every state in state_codes - only run when making changes to the variables included in the function above
# blockgroup_level_data <- map_dfr(state_codes,  ~get_census_data_safe(.x))

# remove of redundant columns
#blockgroup_level_data <- blockgroup_level_data %>% 
#  select(-Block_group, County)

# write_csv(blockgroup_level_data, "./blockgroup_level_data.csv")

blockgroup_level_data <- read_csv("./blockgroup_level_data.csv")

# get block group number, which is the first digit of the block number 
prison_boundaries_blocks <- prison_boundaries_blocks %>% 
  mutate(block_group = as.numeric(substr(BLOCKCE10, 1,1)))

# join prison boundaries data to the census data 
prison_boundaries_block_census <- left_join(prison_boundaries_blocks, blockgroup_level_data,
                                            by = c("STATEFP10" = "state",
                                                   "COUNTYFP10" = "county",
                                                   "TRACTCE10" = "tract",
                                                   "block_group" = "block_group"))


# all variables in the dataset 
all_vars <- prison_boundaries_block_census %>% colnames()

Variable definitions are from the documentation for the 2021 Planning Database.

First, we can look at the distributions of the median age in block groups where facilities are located.

Distribution of Median Age

Distribution of the Percentage Urbanized

As defined in the documentation, the variable definitions are as follows:
* pct_URBANIZED_AREA_POP_CEN_2010: “percentage of the 2010 Census total population that lives in a densely settled area containing 50,000 or more people”
* pct_URBAN_CLUSTER_POP_CEN_2010: “percentage of the 2010 Census total population that lives in a densely settled area containing 2,500 to 49,999 people” * pct_RURAL_POP_CEN_2010: “percentage of the 2010 Census total population that lives outside of an urbanized area or urban cluster”

Percentage of Facilities in a Block Group that was Mostly Rural or Mostly Urban
Rural block groups are defined here as those where greater than 50% of the 2010 Census total population for that block group
was recorded to live outside of an urbanized area or urban cluster
Data from the 2010 Census
Density Percentage
Rural 31.12%
Urban 68.25%
Missing 0.64%
For Block Groups in the United States: Mean and Median Percentages in Urbanized, Urban Cluster, and Rural Areas
Data from the 2010 Census
Population Density Category Mean Percentage Median Percentage
Urban 69.00312 100
Urban Cluster 10.30077 0
Rural 20.69611 0
For Block Groups Where Facilities are Located: Mean and Median Percentages in Urbanized, Urban Cluster, and Rural Areas
Data from the 2010 Census
Population Density Category Mean Percentage Median Percentage
Urban 36.63423 0.0
Urban Cluster 29.42237 0.0
Rural 33.94341 7.6

We can also see how the distribution of age varies depending on whether blocks are primarily urban or rural, among different facility types, and among different facility security levels.

Distribution of Racial Percentages for each Racial Category

# prison_blocks_race

# 
#  mutate(sum_all = Hispanic_ACS_15_19 + NH_AIAN_alone_ACS_15_19 + NH_Asian_alone_ACS_15_19 + NH_Blk_alone_ACS_15_19 + NH_NHOPI_alone_ACS_15_19 + NH_SOR_alone_ACS_15_19 + NH_White_alone_ACS_15_19) %>% View()

# for race reference csv, copied and pasted definitions at link here https://api.census.gov/data/2021/pdb/blockgroup/variables.html into excel file 
race_ref <- read_csv("./races_reference.csv", col_names = FALSE) %>% 
  select(X1, X2) %>% 
  rename(`Racial Category` = X1,
         Description = X2) %>%
  mutate(Description = str_replace(Description, "in the ACS", ""),
         Description = str_replace(Description, ",", ",\n")) %>%
  filter(!str_detect(Description, "Census") & !str_detect(Description, "MOE") )

blockgroup_level_data %>% 
  select(
         Hispanic_ACS_15_19, NH_AIAN_alone_ACS_15_19, 
         NH_Asian_alone_ACS_15_19, NH_Blk_alone_ACS_15_19, 
         NH_NHOPI_alone_ACS_15_19, NH_SOR_alone_ACS_15_19,
         NH_White_alone_ACS_15_19, Tot_Population_ACS_15_19) %>%
  mutate(across(1:ncol(.), ~ (.x / Tot_Population_ACS_15_19) * 100)) %>% # get percentage by dividing by total for that block
  select(-Tot_Population_ACS_15_19) %>%
  pivot_longer(1:ncol(.), names_to = "Racial Category", values_to  ="Percentage") %>%
  left_join(race_ref, by = c("Racial Category" = "Racial Category")) %>%
  group_by(Description) %>%
  mutate(Median = median(Percentage, na.rm = TRUE),
         Mean = mean(Percentage, na.rm = TRUE)) %>%
  ggplot(aes(x = Percentage)) + 
  geom_histogram(alpha = .9) + 
  geom_vline(aes(xintercept = Median), color = "darkred") +
  geom_vline(aes(xintercept = Mean), color = "#718BCE") +
  facet_wrap(~Description, scales = "free_y", ncol = 2) +
  scale_x_continuous(breaks = seq(0, 100, by = 10)) +
  theme_bw() + 
  labs(y = "Number of Block Groups", 
       x = "Percentage",
       subtitle = "Median in Red, Median in Blue\n2015 – 2019 5-year ACS sample data",
       title = "Distribution of Racial Percentages in Block Groups in the U.S.") +
  theme(text = element_text(family = "Optima"),
        plot.subtitle = element_text(face = "italic", hjust = .5),
        plot.title = element_text(face = "bold", hjust = 0.5))

Distribution of Percentage without Health Insurance

Distribution of Percentages Below the Poverty Level

Referencing the documentation:

Prs_Blw_Pov_Lev_ACS_15_19 is defined as the “Number of people classified as below the poverty level given their total family or household income within the last year, family size, and family composition in the ACS population”.

To calculate the percentage, we can use the Pov_Univ_ACS_15_19 variable, which is defined as the “Population for whom poverty level is determined”.

Distribution of the Percentage of Housing Units Rented

Renter_Occp_HU_ACS_15_19 is defined as “Number of ACS occupied housing units that are not owner occupied, whether they are rented or occupied without payment of rent”.

Distribution of Average Household Size

Med_House_Value_BG_ACS_15_19 is defined as the “Median of ACS respondents’ house value estimates per block group”.

Distribution of Average Household Size